#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@创建时间    : 2022/11/3  11:16
@作者  : st
@文件名: data_process_nlpcc.py
@项目名: PyCharm
@文件描述:nlpcc2015 数据处理
    
"""
import os.path

from constant import matadata_path


def get_to_text(file_path=None):
    """Load a character-tagged segmentation file as sentences and word lists.

    The file is expected to hold one ``char<TAB>tag`` pair per line, with a
    blank line separating sentences.

    Args:
        file_path: Path of the tagged file to read. When ``None`` (the
            default) the file ``nlpcc_data/trainSeg.txt`` under
            ``matadata_path`` is used.

    Returns:
        A ``(text_list, res_list)`` tuple of equal-length lists:
        ``text_list[i]`` is the i-th sentence as a plain string and
        ``res_list[i]`` is the list of words that sentence is split into.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if file_path is None:
        file_path = os.path.join(matadata_path, 'nlpcc_data', 'trainSeg.txt')
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of loading it whole.
        return _parse_tagged_lines(f)


def _parse_tagged_lines(lines):
    """Group ``char<TAB>tag`` lines into sentences and their words.

    Tag handling: ``S`` emits the character as a one-character word, ``B``
    starts a new word, ``E`` ends the current word, and any other tag
    continues the current word.  Lines that do not consist of exactly two
    tab-separated fields are ignored.

    Args:
        lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        ``(text_list, res_list)`` as described in :func:`get_to_text`.
        Sentences without any character are not emitted.
    """
    text_list = []
    res_list = []
    chars = []    # characters of the sentence being read
    words = []    # completed words of the sentence being read
    pending = []  # characters of the word currently being assembled

    def flush_sentence():
        # An unterminated word (no closing 'E') is kept with its own
        # sentence rather than leaking into the next one.
        if pending:
            words.append(''.join(pending))
        # Consecutive blank lines must not produce empty sentences.
        if chars:
            text_list.append(''.join(chars))
            res_list.append(list(words))
        chars.clear()
        words.clear()
        pending.clear()

    for raw_line in lines:
        # NOTE: strip() also removes a leading whitespace character, so a
        # line whose character field is itself whitespace ends up with a
        # single field and is skipped by the length check below.
        line = raw_line.strip()
        if not line:
            flush_sentence()
            continue
        fields = line.split('\t')
        if len(fields) != 2:
            continue
        char, tag = fields
        chars.append(char)
        if tag in ('S', 'B'):
            # A new word begins: close any dangling word first so the words
            # stay in reading order and always concatenate to the text.
            if pending:
                words.append(''.join(pending))
                pending.clear()
            if tag == 'S':
                words.append(char)
            else:
                pending.append(char)
        elif tag == 'E':
            pending.append(char)
            words.append(''.join(pending))
            pending.clear()
        else:
            pending.append(char)

    # The file may end without a trailing blank line; the last line has
    # already been processed above, so just emit what is still buffered.
    flush_sentence()
    return text_list, res_list


# Script entry point: parse the default training file once.
# The returned (text_list, res_list) tuple is not used here.
if __name__ == '__main__':
    get_to_text()





